In [ ]:
# Display figures inline in the IPython notebook
%pylab inline

Similarity-based Learning

References

  1. Zhitao Yin gave a great introduction to text mining https://github.com/zhitaoyin/GSU-Text-Mining-Workshop-Fall-2015
  2. Scikit-Learn online documentation http://scikit-learn.org/stable/documentation.html
  3. scikit-learn Cookbook http://it-ebooks.info/book/4664/
  4. Mastering Machine Learning with scikit-learn http://it-ebooks.info/book/4315/
  5. Classification of text documents using sparse features http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

Doing things from scratch


In [5]:
from math import sqrt
import numpy as np
from numpy.random import rand

In [6]:
import pandas as pd

In [ ]:
# pandas' CSV reader; the bank marketing data used below is semicolon-separated
pd.read_csv("data/bank.csv", sep=';').head()

In [7]:
%cp ../03-InformationBased/car.data .
%cp ../03-InformationBased/car.names .

In [20]:
df = pd.read_csv("data/bank.csv", sep=';')
df.columns


Out[20]:
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [35]:
X = df[['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']].as_matrix()

In [41]:
from collections import Counter

def kNN_with_dataframes_majority(df, independent_cols, target_col, query_df, k):
    ## k-NN on dataframes: for each query row, return the majority target value
    ## among its k nearest neighbors in df (uses distance(), defined further down)
    X = df[independent_cols].as_matrix()
    y = df[target_col]
    Q = query_df[independent_cols].as_matrix()

    predictions = []
    for q in Q:
        d = []
        for j in range(0, len(X)):
            dis = distance(q, X[j])
            d.append((dis, j))
        d = sorted(d, key=lambda tp: tp[0])
        neighbors = [d[i][1] for i in range(0, k)]
        ## majority vote over the neighbors' target values
        predictions.append(Counter(y.iloc[neighbors]).most_common(1)[0][0])
    return predictions


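A quick way to exercise the function above (a sketch; the query values and k are arbitrary, and the distance() helper defined further down must be run first):

In [ ]:
## sketch: classify one made-up query row against the bank data
cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
query = pd.DataFrame([{'age': 30, 'balance': 1500, 'duration': 120,
                       'campaign': 2, 'pdays': -1, 'previous': 0}])
kNN_with_dataframes_majority(df, cols, 'y', query, 5)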

In [45]:
np.shape(X)


Out[45]:
(4521, 6)

In [51]:
d = X[2] - np.array([23, 100, 89, 1, 232, 1])

In [57]:
d, d*d, sqrt(sum(d*d))


Out[57]:
(array([  12, 1250,   96,    0,   98,    0]),
 array([    144, 1562500,    9216,       0,    9604,       0]),
 1257.5627220938127)

In [ ]:
## most primitive Euclidean distance
def distance(vect1, vect2):
    s = 0
    ## we assume both vectors have the same dimension
    for i in range(0, len(vect1)):
        delta = vect1[i] - vect2[i]
        s += delta*delta
    return sqrt(s)

distance([1, 2, 3], [-4, 2, 0])
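
As a sanity check, NumPy's built-in norm should give the same Euclidean distance as the hand-rolled loop:

In [ ]:
## sanity check: the norm of the difference vector equals distance([1, 2, 3], [-4, 2, 0])
np.linalg.norm(np.array([1, 2, 3]) - np.array([-4, 2, 0]))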

In [ ]:
distance(rand(1000), rand(1000))

In [ ]:
## most primitive k-NN X=samples, q=query, k=number of neighbors
def kNN(X, q, k):
    d = []
    for j in range(0, len(X)):
        dis = distance(q, X[j])
        d.append( ( dis, j) )

    d = sorted(d, key=lambda tp: tp[0])
    res = []
    for i in range(0, k):
        res.append(d[i][1])
    return res ##, d[:k]

In [ ]:
data = np.random.rand(1000,10)
q = np.random.rand(10)
kNN(data, q, 5)

In [ ]:
## let's try this out on a really big set
import time

data = np.random.rand(10000, 10000)
q = np.random.rand(10000)

t0 = time.time()
kNN(data, q, 3)
t1 = time.time()

print("This took %.3f seconds" % (t1-t0))

The following steps are from Zhitao Yin's text-mining example (reference 1 above):


In [ ]:
# Import matplotlib package to plot figures
import matplotlib.pyplot as plt

In [ ]:
# Import seaborn package to make figures look better
import seaborn as sns

In [ ]:
# Import pandas package to store and manipulate data
import pandas as pd

In [ ]:
# Import numpy and scipy packages to do scientific analysis
import numpy as np
import scipy as sp
import scipy.stats

In [ ]:
# Import csv package to convert pandas dataframe to csv file
import csv

In [ ]:
# Import chain from itertools to iterate over nested sequences
from itertools import chain

In [ ]:
# Import Counter package to do counting
from collections import Counter, defaultdict

In [ ]:
# Import operator package to sort a dictionary by its values
import operator

In [ ]:
# Import re package to implement regular expression
import re

In [ ]:
# Import timer
import time

In [ ]:
# Import topic model packages
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import ldamodel
from gensim import matutils

In [ ]:
# Import NLTK package
from nltk import sent_tokenize,word_tokenize,porter
from nltk import PorterStemmer
from nltk.corpus import stopwords

In [ ]:
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause

##from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
#from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
#from sklearn.svm import LinearSVC
#from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import Perceptron
#from sklearn.linear_model import PassiveAggressiveClassifier
#from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
#from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [ ]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

class Option(object):
    pass
    
opts = Option()

#"Print a detailed classification report."
opts.print_report = True

#"Select some number of features using a chi-squared test"
opts.select_chi2 = 3


#"Print the confusion matrix.")
opts.print_cm = True

#"Print ten most discriminative terms per class for every classifier."
opts.print_top10 = False

#"Whether to use all categories or not."
opts.all_categories = True

#"Use a hashing vectorizer."
opts.use_hashing = True

#"n_features when using the hashing vectorizer."
opts.n_features = 2 ** 16

#"Remove newsgroup information that is easily overfit: headers, signatures, and quoting."
opts.filtered = True

In [ ]:
###############################################################################
# Load some categories from the training set
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]

if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

In [ ]:
print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
print('data loaded')

In [ ]:
data_train.data[3]

In [ ]:
data_train.target_names

In [ ]:
categories = data_train.target_names    # covers the case categories is None (all categories)


def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()

In [ ]:
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()

### use hashing?
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0

In [ ]:
X_train.shape, len(data_train.data)

In [ ]:
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

In [ ]:
# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [ ]:
###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                      % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [ ]:
results = benchmark(KNeighborsClassifier(n_neighbors=10))
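
NearestCentroid is imported above but never exercised; a quick comparison against the k-NN result, using the same benchmark helper, might look like this:

In [ ]:
## sketch: benchmark the Rocchio-style NearestCentroid classifier for comparison
results_centroid = benchmark(NearestCentroid())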
